package com.meiyuetao.myt.crawl.filter;

import java.util.List;
import java.util.Map;
import java.util.Set;

import lab.s2jh.crawl.filter.ParseFilterChain;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.common.collect.Sets;

/**
 * Crawls the category (item listing) pages of a Taobao/Tmall shop and injects
 * the item detail URLs found on them into the crawl service.
 *
 * <p>The page to process is taken from the {@code pageNo=} query parameter of
 * the incoming URL:
 * <ul>
 * <li>{@code pageNo=*} - start at page 1 and keep requesting the next page
 * until a page yields no matching links;</li>
 * <li>{@code pageNo=<number>} - process exactly that one page.</li>
 * </ul>
 */
public class TaoBaoCategoryParseFilter extends AbstractCommodityParseFilter {

    private static final Logger logger = LoggerFactory.getLogger(TaoBaoCategoryParseFilter.class);

    /** Name of the query parameter (including '=') that carries the page number. */
    private static final String PAGE_NO_PARAM = "pageNo=";

    /** Value of the page parameter that requests crawling of all pages. */
    private static final String ALL_PAGES = "*";

    /** Only links starting with this prefix are injected as item URLs. */
    private static final String ITEM_URL_PREFIX = "http://item.taobao.com/item.htm?";

    /** XPath selecting the item detail anchors on a category page. */
    private static final String ITEM_LINK_XPATH = "//DIV[@class='skin-box-bd']//DIV[@class='shop-hesper-bd grid']//DIV[@class='item3line1']//DL//DD[@class='detail']//A";

    /**
     * Fetches one or more category pages and injects the item URLs found on
     * them via {@code crawlService.injectUrls}.
     *
     * @param url category page URL containing a {@code pageNo=} parameter whose
     *            value is either {@code *} or a page number
     * @param filterChain the filter chain; not used by this implementation
     * @throws NumberFormatException if the {@code pageNo} value is neither
     *             {@code *} nor a valid integer
     */
    @Override
    public void doFilterInternal(String url, ParseFilterChain filterChain) {
        logger.debug("Invoking {} ...", this.getClass());
        // Example inputs:
        // http://iyoubox.taobao.com/category.htm?spm=a1z10.3.w4002-4158124886.41.gOPHPk&mid=w-4158124886-0&search=y&pageNo=1
        // http://disney.tmall.com/category.htm?spm=a1z10.5.w4011-2440936552.401.4aSw9J&mid=w-2440936552-0&pageNo=1#anchor
        String pageNo = extractPageNo(url);
        int pager = 1;
        boolean continueLoop = true;
        if (!ALL_PAGES.equals(pageNo)) {
            // A concrete page was requested: process only that page.
            pager = Integer.parseInt(pageNo);
            continueLoop = false;
        }

        // Everything after "pageNo=" is dropped and replaced by the page counter.
        String pageNoBefore = StringUtils.substringBefore(url, PAGE_NO_PARAM);

        do {
            String catPageUrl = pageNoBefore + PAGE_NO_PARAM + pager;
            pager++;
            logger.info("Process category page: {}", catPageUrl);
            HtmlPage catPage = fetchHtmlPage(catPageUrl);
            if (catPage == null) {
                // Defensive guard: whether fetchHtmlPage can return null is not
                // visible here; stop instead of failing with a NullPointerException.
                logger.warn("Failed to fetch category page: {}", catPageUrl);
                break;
            }

            List<?> links = catPage.getByXPath(ITEM_LINK_XPATH);
            if (links == null || links.isEmpty()) {
                // An empty page marks the end of the listing.
                logger.info("No more valid links...");
                continueLoop = false;
            } else {
                crawlService.injectUrls(collectItemUrls(links));
            }
        } while (continueLoop);
    }

    /**
     * Returns the raw value of the {@code pageNo} parameter, cut at the first
     * {@code &} or {@code #} so that following parameters or a fragment (such
     * as {@code pageNo=1#anchor}) do not end up in the value.
     */
    private static String extractPageNo(String url) {
        String raw = StringUtils.substringAfter(url, PAGE_NO_PARAM);
        String withoutParams = StringUtils.substringBefore(raw, "&");
        return StringUtils.trim(StringUtils.substringBefore(withoutParams, "#"));
    }

    /**
     * Collects the distinct hrefs of the given anchors that start with the
     * accepted item URL prefix.
     */
    private static String[] collectItemUrls(List<?> links) {
        Set<String> pageUrls = Sets.newHashSet();
        for (Object element : links) {
            HtmlAnchor anchor = (HtmlAnchor) element;
            String href = anchor.getAttribute("href");
            // Sample of a Tmall detail link seen on these pages:
            // http://detail.tmall.com/item.htm?spm=a1z10.3.w4011-2877147662.72.ZAiF31&id=17347946555&rn=93461b8808ac12eb3c7007291f23c337
            // NOTE(review): such detail.tmall.com links do not match the prefix
            // below and are skipped - confirm whether that is intended.
            if (href.startsWith(ITEM_URL_PREFIX)) {
                logger.info("Found new valid url: {}", href);
                pageUrls.add(href);
            }
        }
        return pageUrls.toArray(new String[0]);
    }

    /**
     * Not implemented for category pages.
     *
     * @param url the page URL (ignored)
     * @return always {@code null}
     */
    @Override
    public Map<String, Object> parseSimpleData(String url) {
        // TODO Auto-generated method stub
        return null;
    }

}
